//
// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
// SPDX-License-Identifier: Apache-2.0
//

// Do not flag up inline assembly blocks
#pragma GCC diagnostic ignored "-Woverlength-strings"

#if !defined(__ARM_FEATURE_MATMUL_INT8)
#error "I8mm extension required to compile this micro-kernel"
#else
#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h"

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

#include "kai/kai_common.h"

// Step sizes: the offset helpers below assert that m_idx / n_idx are multiples of these.
static const size_t kai_m_step = 8;
static const size_t kai_n_step = 8;
// Packing parameters exposed through the kai_get_{mr,nr,kr,sr} getters.
// kai_mr and kai_nr also scale the packed LHS / RHS strides; kai_kr * kai_sr sets the K rounding granule.
static const size_t kai_mr = 4;
static const size_t kai_nr = 8;
static const size_t kai_kr = 16;
static const size_t kai_sr = 2;
// Byte sizes of the extra values that are added to the packed strides
// (per LHS row: multiplier + offset; per RHS column: multiplier + sum + bias).
static const size_t kai_num_bytes_multiplier_lhs = sizeof(float);
static const size_t kai_num_bytes_multiplier_rhs = sizeof(float);
static const size_t kai_num_bytes_offset_lhs = sizeof(int32_t);
static const size_t kai_num_bytes_sum_rhs = sizeof(int32_t);
static const size_t kai_num_bytes_bias = sizeof(float);

// Returns k padded up to the K granule used by the packed layouts.
inline static size_t kai_k_roundedup(size_t k) {
    // A float and an int32 value are packed at the end of each row, so the
    // granule (kr * sr) is itself rounded up to a multiple of 4 to keep them aligned.
    const size_t k_granule = kai_roundup(kai_kr * kai_sr, 4);
    const size_t k_padded = kai_roundup(k, k_granule);
    return k_padded;
}

// Returns the size in bytes of one packed LHS block of kai_mr rows for a given k.
inline static size_t kai_lhs_packed_stride(size_t k) {
    const size_t k_padded = kai_k_roundedup(k);
    KAI_ASSERT((k_padded % 2) == 0);

    // Per row: k_padded int8 values plus one multiplier and one offset.
    const size_t bytes_per_row = k_padded * sizeof(int8_t) + kai_num_bytes_multiplier_lhs + kai_num_bytes_offset_lhs;
    return kai_mr * bytes_per_row;
}

// Returns the size in bytes of one packed RHS block of kai_nr columns for a given k.
inline static size_t kai_rhs_packed_stride(size_t k) {
    const size_t k_padded = kai_k_roundedup(k);
    KAI_ASSERT((k_padded % 2) == 0);

    // Per column: k_padded / 2 data bytes (two K values share a byte) plus
    // one multiplier, one sum and one bias value.
    const size_t bytes_per_col =
        (k_padded / 2) + kai_num_bytes_multiplier_rhs + kai_num_bytes_sum_rhs + kai_num_bytes_bias;
    return kai_nr * bytes_per_col;
}

// Returns the m step of this micro-kernel (kai_m_step, 8); row indices passed to the offset helpers must be multiples of it.
size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void) {
    return kai_m_step;
}

// Returns the n step of this micro-kernel (kai_n_step, 8); column indices passed to the offset helpers must be multiples of it.
size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void) {
    return kai_n_step;
}

// Returns the mr packing parameter (kai_mr, 4): the number of rows in one packed LHS block.
size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void) {
    return kai_mr;
}

// Returns the nr packing parameter (kai_nr, 8): the number of columns in one packed RHS block.
size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void) {
    return kai_nr;
}

// Returns the kr packing parameter (kai_kr, 16).
size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void) {
    return kai_kr;
}

// Returns the sr packing parameter (kai_sr, 2).
size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void) {
    return kai_sr;
}

// Returns the byte offset into the packed LHS buffer of the block that holds row m_idx.
// m_idx must be a multiple of the m step; k is the (unpadded) K dimension.
size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t m_idx, size_t k) {
    KAI_ASSERT((m_idx % kai_m_step) == 0);

    // The packed LHS is laid out in blocks of kai_mr rows.
    const size_t block_index = m_idx / kai_mr;
    const size_t block_bytes = kai_lhs_packed_stride(k);
    return block_index * block_bytes;
}

// Returns the byte offset into the packed RHS buffer of the block that holds column n_idx.
// n_idx must be a multiple of the n step; k is the (unpadded) K dimension.
size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t n_idx, size_t k) {
    KAI_ASSERT((n_idx % kai_n_step) == 0);

    // The packed RHS is laid out in blocks of kai_nr columns.
    const size_t block_index = n_idx / kai_nr;
    const size_t block_bytes = kai_rhs_packed_stride(k);
    return block_index * block_bytes;
}

// Returns the byte offset of element (m_idx, n_idx) in the float destination matrix.
// dst_stride is the row stride in bytes; both indices must be multiples of their step.
size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(
    size_t m_idx, size_t n_idx, size_t dst_stride) {
    KAI_ASSERT((m_idx % kai_m_step) == 0);
    KAI_ASSERT((n_idx % kai_n_step) == 0);

    const size_t col_bytes = n_idx * sizeof(float);
    const size_t row_bytes = m_idx * dst_stride;
    return col_bytes + row_bytes;
}

// Returns the size in bytes of a densely packed m x n float destination matrix.
size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t m, size_t n) {
    const size_t num_elements = m * n;
    return num_elements * sizeof(float);
}

// Runs the matmul micro-kernel: dst = clamp(lhs_packed * rhs_packed, scalar_min, scalar_max).
//
// Parameters:
//   m, n            Number of rows / columns of the destination to compute.
//                   If either is zero the function returns without touching memory.
//   k               K dimension; internally rounded up with kai_k_roundedup().
//   lhs_packed      Packed LHS buffer (blocks of kai_mr rows, see kai_lhs_packed_stride()).
//   rhs_packed      Packed RHS buffer (blocks of kai_nr columns, see kai_rhs_packed_stride()).
//   dst             Destination matrix of float values.
//   dst_stride_row  Stride in bytes between two destination rows.
//   dst_stride_col  Stride in bytes between two destination columns; must be sizeof(float).
//   scalar_min/max  Lower / upper clamp bounds applied to every output value.
//
// Structure of the assembly:
//   * Labels 1-11: main path, processes 8 rows x 8 columns per iteration while at least 8 rows remain.
//   * Labels 13-22: row tail, processes up to 4 rows x 8 columns per iteration for the remaining rows.
//   * In both paths a column count below 8 takes the "Partial output" branch, which stores
//     4 / 2 / 1 floats per row according to the bits of the remaining column count.
void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(
    size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed,
    float* dst,  // NOLINT(readability-non-const-parameter)
    size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
    KAI_ASSERT(dst_stride_col == sizeof(float));

    // Nothing to compute for an empty output.
    // The n == 0 check is required for memory safety: the column loops in the assembly are
    // bottom-tested, so with n == 0 they would still execute once and the partial-output path
    // would store one float per row (labels 9 / 20) outside of the destination.
    if (m == 0 || n == 0) {
        return;
    }

    const size_t k_internal = kai_k_roundedup(k);

    // Each iteration of the "Sub block loop" consumes 32 K values.
    const size_t num_blocks = k_internal / 32;

    // Loaded in the assembly with ld1r: [0] is the lower bound (fmax), [1] the upper bound (fmin).
    float clamp_vals[2] = {scalar_min, scalar_max};

    __asm__ __volatile__(
        "mov x12, %x[m]\n"
        "mov x11, #0x80\n"
        "movi v24.16b, #0xf0\n"
        "mov x20, #0x20\n"
        "cmp x12, #0x8\n"
        // x11 = num_blocks * 0x80 + 0x20: byte size of one packed LHS block of 4 rows
        "madd x11, %x[num_blocks], x11, x20\n"
        "blt 12f\n"
        "1:"  // Row loop
        "mov x10, %x[rhs_packed]\n"
        "mov x9, %x[n]\n"
        "add x28, %x[dst], %x[dst_stride_row], LSL #3\n"
        "2:"  // Column loop
        "mov x22, %x[lhs_packed]\n"
        "movi v13.4s, #0x0\n"
        "movi v22.4s, #0x0\n"
        "mov x21, %x[num_blocks]\n"
        "movi v11.4s, #0x0\n"
        "movi v15.4s, #0x0\n"
        "movi v14.4s, #0x0\n"
        "movi v17.4s, #0x0\n"
        "add x20, x22, x11\n"
        "movi v1.4s, #0x0\n"
        "movi v8.4s, #0x0\n"
        "movi v30.4s, #0x0\n"
        "movi v28.4s, #0x0\n"
        "movi v10.4s, #0x0\n"
        "movi v27.4s, #0x0\n"
        "movi v18.4s, #0x0\n"
        "movi v12.4s, #0x0\n"
        "movi v4.4s, #0x0\n"
        "movi v26.4s, #0x0\n"
        "3:"  // Sub block loop
        "ldr q31, [x10, #0x0]\n"
        "ldr q3, [x10, #0x10]\n"
        "subs x21, x21, #0x1\n"
        "ldr q6, [x10, #0x20]\n"
        "ldr q25, [x10, #0x30]\n"
        "ldr q19, [x22, #0x0]\n"
        "ldr q7, [x22, #0x10]\n"
        "ldr q20, [x20, #0x0]\n"
        "ldr q5, [x20, #0x10]\n"
        "shl v21.16b, v31.16b, #0x4\n"
        "shl v2.16b, v3.16b, #0x4\n"
        "ldr q16, [x10, #0x40]\n"
        "ldr q23, [x10, #0x50]\n"
        "shl v9.16b, v6.16b, #0x4\n"
        "shl v29.16b, v25.16b, #0x4\n"
        "ldr q0, [x10, #0x60]\n"
        "and v31.16b, v31.16b, v24.16b\n"
        "and v3.16b, v3.16b, v24.16b\n"
        ".inst 0x4e95a66d  // smmla v13.4s, v19.16b, v21.16b\n"
        ".inst 0x4e82a66b  // smmla v11.4s, v19.16b, v2.16b\n"
        ".inst 0x4e95a4ee  // smmla v14.4s, v7.16b, v21.16b\n"
        "and v6.16b, v6.16b, v24.16b\n"
        ".inst 0x4e89a676  // smmla v22.4s, v19.16b, v9.16b\n"
        ".inst 0x4e9da66f  // smmla v15.4s, v19.16b, v29.16b\n"
        "ldr q19, [x10, #0x70]\n"
        "and v25.16b, v25.16b, v24.16b\n"
        ".inst 0x4e82a4e1  // smmla v1.4s, v7.16b, v2.16b\n"
        ".inst 0x4e89a4f1  // smmla v17.4s, v7.16b, v9.16b\n"
        "add x10, x10, #0x80\n"
        ".inst 0x4e9da4e8  // smmla v8.4s, v7.16b, v29.16b\n"
        "ldr q7, [x22, #0x20]\n"
        ".inst 0x4e95a69e  // smmla v30.4s, v20.16b, v21.16b\n"
        ".inst 0x4e82a68a  // smmla v10.4s, v20.16b, v2.16b\n"
        ".inst 0x4e89a69c  // smmla v28.4s, v20.16b, v9.16b\n"
        ".inst 0x4e9da69b  // smmla v27.4s, v20.16b, v29.16b\n"
        "ldr q20, [x22, #0x30]\n"
        ".inst 0x4e95a4b2  // smmla v18.4s, v5.16b, v21.16b\n"
        "ldr q21, [x20, #0x20]\n"
        ".inst 0x4e82a4a4  // smmla v4.4s, v5.16b, v2.16b\n"
        "ldr q2, [x20, #0x30]\n"
        ".inst 0x4e89a4ac  // smmla v12.4s, v5.16b, v9.16b\n"
        "ldr q9, [x22, #0x40]\n"
        ".inst 0x4e9da4ba  // smmla v26.4s, v5.16b, v29.16b\n"
        "ldr q29, [x22, #0x50]\n"
        "shl v5.16b, v16.16b, #0x4\n"
        "and v16.16b, v16.16b, v24.16b\n"
        ".inst 0x4e85a4ed  // smmla v13.4s, v7.16b, v5.16b\n"
        ".inst 0x4e85a68e  // smmla v14.4s, v20.16b, v5.16b\n"
        ".inst 0x4e85a6be  // smmla v30.4s, v21.16b, v5.16b\n"
        ".inst 0x4e85a452  // smmla v18.4s, v2.16b, v5.16b\n"
        "shl v5.16b, v23.16b, #0x4\n"
        "and v23.16b, v23.16b, v24.16b\n"
        ".inst 0x4e85a4eb  // smmla v11.4s, v7.16b, v5.16b\n"
        ".inst 0x4e85a681  // smmla v1.4s, v20.16b, v5.16b\n"
        ".inst 0x4e85a6aa  // smmla v10.4s, v21.16b, v5.16b\n"
        ".inst 0x4e85a444  // smmla v4.4s, v2.16b, v5.16b\n"
        "shl v5.16b, v0.16b, #0x4\n"
        ".inst 0x4e9fa52d  // smmla v13.4s, v9.16b, v31.16b\n"
        ".inst 0x4e9fa7ae  // smmla v14.4s, v29.16b, v31.16b\n"
        "and v0.16b, v0.16b, v24.16b\n"
        ".inst 0x4e85a4f6  // smmla v22.4s, v7.16b, v5.16b\n"
        ".inst 0x4e85a691  // smmla v17.4s, v20.16b, v5.16b\n"
        ".inst 0x4e85a6bc  // smmla v28.4s, v21.16b, v5.16b\n"
        ".inst 0x4e85a44c  // smmla v12.4s, v2.16b, v5.16b\n"
        "shl v5.16b, v19.16b, #0x4\n"
        ".inst 0x4e83a52b  // smmla v11.4s, v9.16b, v3.16b\n"
        ".inst 0x4e83a7a1  // smmla v1.4s, v29.16b, v3.16b\n"
        "and v19.16b, v19.16b, v24.16b\n"
        ".inst 0x4e85a4ef  // smmla v15.4s, v7.16b, v5.16b\n"
        "ldr q7, [x20, #0x40]\n"
        ".inst 0x4e85a688  // smmla v8.4s, v20.16b, v5.16b\n"
        "ldr q20, [x20, #0x50]\n"
        ".inst 0x4e85a6bb  // smmla v27.4s, v21.16b, v5.16b\n"
        "ldr q21, [x22, #0x60]\n"
        ".inst 0x4e85a45a  // smmla v26.4s, v2.16b, v5.16b\n"
        "ldr q5, [x22, #0x70]\n"
        "ldr q2, [x20, #0x60]\n"
        ".inst 0x4e86a536  // smmla v22.4s, v9.16b, v6.16b\n"
        ".inst 0x4e86a7b1  // smmla v17.4s, v29.16b, v6.16b\n"
        "add x22, x22, #0x80\n"
        ".inst 0x4e9fa4fe  // smmla v30.4s, v7.16b, v31.16b\n"
        ".inst 0x4e83a4ea  // smmla v10.4s, v7.16b, v3.16b\n"
        ".inst 0x4e99a52f  // smmla v15.4s, v9.16b, v25.16b\n"
        "ldr q9, [x20, #0x70]\n"
        ".inst 0x4e99a7a8  // smmla v8.4s, v29.16b, v25.16b\n"
        "add x20, x20, #0x80\n"
        ".inst 0x4e86a4fc  // smmla v28.4s, v7.16b, v6.16b\n"
        ".inst 0x4e99a4fb  // smmla v27.4s, v7.16b, v25.16b\n"
        ".inst 0x4e9fa692  // smmla v18.4s, v20.16b, v31.16b\n"
        ".inst 0x4e83a684  // smmla v4.4s, v20.16b, v3.16b\n"
        ".inst 0x4e86a68c  // smmla v12.4s, v20.16b, v6.16b\n"
        ".inst 0x4e99a69a  // smmla v26.4s, v20.16b, v25.16b\n"
        ".inst 0x4e90a6ad  // smmla v13.4s, v21.16b, v16.16b\n"
        ".inst 0x4e97a6ab  // smmla v11.4s, v21.16b, v23.16b\n"
        ".inst 0x4e80a6b6  // smmla v22.4s, v21.16b, v0.16b\n"
        ".inst 0x4e93a6af  // smmla v15.4s, v21.16b, v19.16b\n"
        ".inst 0x4e90a4ae  // smmla v14.4s, v5.16b, v16.16b\n"
        ".inst 0x4e97a4a1  // smmla v1.4s, v5.16b, v23.16b\n"
        ".inst 0x4e80a4b1  // smmla v17.4s, v5.16b, v0.16b\n"
        ".inst 0x4e93a4a8  // smmla v8.4s, v5.16b, v19.16b\n"
        ".inst 0x4e90a45e  // smmla v30.4s, v2.16b, v16.16b\n"
        ".inst 0x4e97a44a  // smmla v10.4s, v2.16b, v23.16b\n"
        ".inst 0x4e80a45c  // smmla v28.4s, v2.16b, v0.16b\n"
        ".inst 0x4e93a45b  // smmla v27.4s, v2.16b, v19.16b\n"
        ".inst 0x4e90a532  // smmla v18.4s, v9.16b, v16.16b\n"
        ".inst 0x4e97a524  // smmla v4.4s, v9.16b, v23.16b\n"
        ".inst 0x4e80a52c  // smmla v12.4s, v9.16b, v0.16b\n"
        ".inst 0x4e93a53a  // smmla v26.4s, v9.16b, v19.16b\n"
        "bgt 3b\n"
        // Post-processing of the integer accumulators for the first 4 rows (LHS block at x22):
        // integer correction (mla), conversion to float (scvtf) and scaling (fmul).
        "ldr q5, [x10, #0x0]\n"
        "ldr q19, [x10, #0x10]\n"
        "uzp1 v2.2d, v13.2d, v11.2d\n"
        "uzp2 v20.2d, v13.2d, v11.2d\n"
        "ld1 { v11.4s }, [x22]\n"
        "ldr q23, [x10, #0x20]\n"
        "uzp1 v9.2d, v22.2d, v15.2d\n"
        "uzp2 v29.2d, v22.2d, v15.2d\n"
        "ldr q6, [x10, #0x30]\n"
        "uzp1 v31.2d, v14.2d, v1.2d\n"
        "uzp2 v7.2d, v14.2d, v1.2d\n"
        "add x22, x22, #0x10\n"
        "ldr q22, [x22, #0x0]\n"
        "uzp1 v0.2d, v17.2d, v8.2d\n"
        "uzp2 v16.2d, v17.2d, v8.2d\n"
        "add x10, x10, #0x40\n"
        "mla v2.4s, v5.4s, v11.s[0]\n"
        "mla v9.4s, v19.4s, v11.s[0]\n"
        "mla v20.4s, v5.4s, v11.s[1]\n"
        "mla v29.4s, v19.4s, v11.s[1]\n"
        "mla v31.4s, v5.4s, v11.s[2]\n"
        "mla v0.4s, v19.4s, v11.s[2]\n"
        "fmul v15.4s, v23.4s, v22.s[0]\n"
        "mla v7.4s, v5.4s, v11.s[3]\n"
        "mla v16.4s, v19.4s, v11.s[3]\n"
        "fmul v11.4s, v6.4s, v22.s[0]\n"
        "scvtf v2.4s, v2.4s\n"
        "scvtf v9.4s, v9.4s\n"
        "fmul v25.4s, v23.4s, v22.s[1]\n"
        "scvtf v20.4s, v20.4s\n"
        "fmul v14.4s, v6.4s, v22.s[1]\n"
        "scvtf v29.4s, v29.4s\n"
        "fmul v1.4s, v23.4s, v22.s[2]\n"
        "scvtf v31.4s, v31.4s\n"
        "fmul v17.4s, v6.4s, v22.s[2]\n"
        "scvtf v0.4s, v0.4s\n"
        "fmul v21.4s, v23.4s, v22.s[3]\n"
        "scvtf v7.4s, v7.4s\n"
        "fmul v3.4s, v6.4s, v22.s[3]\n"
        "scvtf v16.4s, v16.4s\n"
        "fmul v13.4s, v2.4s, v15.4s\n"
        "fmul v22.4s, v9.4s, v11.4s\n"
        "fmul v11.4s, v20.4s, v25.4s\n"
        "fmul v15.4s, v29.4s, v14.4s\n"
        "fmul v14.4s, v31.4s, v1.4s\n"
        "fmul v17.4s, v0.4s, v17.4s\n"
        "fmul v1.4s, v7.4s, v21.4s\n"
        "fmul v8.4s, v16.4s, v3.4s\n"
        // Same post-processing for the second 4 rows (LHS block at x20)
        "ld1 { v20.4s }, [x20]\n"
        "uzp1 v2.2d, v30.2d, v10.2d\n"
        "uzp2 v10.2d, v30.2d, v10.2d\n"
        "add x20, x20, #0x10\n"
        "ldr q3, [x20, #0x0]\n"
        "uzp1 v0.2d, v28.2d, v27.2d\n"
        "uzp2 v31.2d, v28.2d, v27.2d\n"
        "uzp1 v29.2d, v18.2d, v4.2d\n"
        "uzp2 v9.2d, v18.2d, v4.2d\n"
        "uzp1 v4.2d, v12.2d, v26.2d\n"
        "uzp2 v21.2d, v12.2d, v26.2d\n"
        "mla v2.4s, v5.4s, v20.s[0]\n"
        "mla v0.4s, v19.4s, v20.s[0]\n"
        "mla v10.4s, v5.4s, v20.s[1]\n"
        "fmul v30.4s, v23.4s, v3.s[0]\n"
        "mla v31.4s, v19.4s, v20.s[1]\n"
        "mla v29.4s, v5.4s, v20.s[2]\n"
        "fmul v7.4s, v6.4s, v3.s[0]\n"
        "mla v4.4s, v19.4s, v20.s[2]\n"
        "mla v9.4s, v5.4s, v20.s[3]\n"
        "fmul v18.4s, v23.4s, v3.s[1]\n"
        "mla v21.4s, v19.4s, v20.s[3]\n"
        "scvtf v2.4s, v2.4s\n"
        "scvtf v0.4s, v0.4s\n"
        "scvtf v10.4s, v10.4s\n"
        "fmul v27.4s, v6.4s, v3.s[1]\n"
        "scvtf v31.4s, v31.4s\n"
        "fmul v20.4s, v23.4s, v3.s[2]\n"
        "scvtf v29.4s, v29.4s\n"
        "fmul v19.4s, v6.4s, v3.s[2]\n"
        "scvtf v4.4s, v4.4s\n"
        "fmul v23.4s, v23.4s, v3.s[3]\n"
        "scvtf v9.4s, v9.4s\n"
        "fmul v6.4s, v6.4s, v3.s[3]\n"
        "scvtf v21.4s, v21.4s\n"
        "fmul v30.4s, v2.4s, v30.4s\n"
        "fmul v28.4s, v0.4s, v7.4s\n"
        "fmul v10.4s, v10.4s, v18.4s\n"
        "fmul v27.4s, v31.4s, v27.4s\n"
        "fmul v18.4s, v29.4s, v20.4s\n"
        "fmul v12.4s, v4.4s, v19.4s\n"
        "fmul v4.4s, v9.4s, v23.4s\n"
        "fmul v26.4s, v21.4s, v6.4s\n"
        // Add the per-column values that follow in the RHS block, then clamp to [clamp_vals[0], clamp_vals[1]]
        "ldr q20, [x10, #0x0]\n"
        "ldr q19, [x10, #0x10]\n"
        "add x20, %x[clamp_vals], #0x4\n"
        "cmp x9, #0x8\n"
        "ld1r { v9.4s }, [%x[clamp_vals]]\n"
        "ld1r { v6.4s }, [x20]\n"
        "add x10, x10, #0x20\n"
        "fadd v13.4s, v13.4s, v20.4s\n"
        "fadd v22.4s, v22.4s, v19.4s\n"
        "fadd v11.4s, v11.4s, v20.4s\n"
        "fadd v15.4s, v15.4s, v19.4s\n"
        "fadd v14.4s, v14.4s, v20.4s\n"
        "fadd v17.4s, v17.4s, v19.4s\n"
        "fadd v1.4s, v1.4s, v20.4s\n"
        "fadd v8.4s, v8.4s, v19.4s\n"
        "fadd v30.4s, v30.4s, v20.4s\n"
        "fadd v28.4s, v28.4s, v19.4s\n"
        "fadd v10.4s, v10.4s, v20.4s\n"
        "fadd v27.4s, v27.4s, v19.4s\n"
        "fadd v18.4s, v18.4s, v20.4s\n"
        "fadd v12.4s, v12.4s, v19.4s\n"
        "fadd v4.4s, v4.4s, v20.4s\n"
        "fadd v26.4s, v26.4s, v19.4s\n"
        "fmax v13.4s, v13.4s, v9.4s\n"
        "fmax v22.4s, v22.4s, v9.4s\n"
        "fmax v11.4s, v11.4s, v9.4s\n"
        "fmax v15.4s, v15.4s, v9.4s\n"
        "fmax v14.4s, v14.4s, v9.4s\n"
        "fmax v17.4s, v17.4s, v9.4s\n"
        "fmax v1.4s, v1.4s, v9.4s\n"
        "fmax v8.4s, v8.4s, v9.4s\n"
        "fmax v30.4s, v30.4s, v9.4s\n"
        "fmax v28.4s, v28.4s, v9.4s\n"
        "fmax v10.4s, v10.4s, v9.4s\n"
        "fmax v27.4s, v27.4s, v9.4s\n"
        "fmax v18.4s, v18.4s, v9.4s\n"
        "fmax v12.4s, v12.4s, v9.4s\n"
        "fmax v4.4s, v4.4s, v9.4s\n"
        "fmax v26.4s, v26.4s, v9.4s\n"
        "fmin v13.4s, v13.4s, v6.4s\n"
        "fmin v22.4s, v22.4s, v6.4s\n"
        "fmin v11.4s, v11.4s, v6.4s\n"
        "fmin v15.4s, v15.4s, v6.4s\n"
        "fmin v14.4s, v14.4s, v6.4s\n"
        "fmin v17.4s, v17.4s, v6.4s\n"
        "fmin v1.4s, v1.4s, v6.4s\n"
        "fmin v8.4s, v8.4s, v6.4s\n"
        "fmin v30.4s, v30.4s, v6.4s\n"
        "fmin v28.4s, v28.4s, v6.4s\n"
        "fmin v10.4s, v10.4s, v6.4s\n"
        "fmin v27.4s, v27.4s, v6.4s\n"
        "fmin v18.4s, v18.4s, v6.4s\n"
        "fmin v12.4s, v12.4s, v6.4s\n"
        "fmin v4.4s, v4.4s, v6.4s\n"
        "fmin v26.4s, v26.4s, v6.4s\n"
        // Fewer than 8 columns left (flags from "cmp x9, #0x8" above): take the partial store path
        "blt 6f\n"
        "mov x20, %x[dst]\n"
        "str q13, [x20, #0x0]\n"
        "str q22, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q11, [x20, #0x0]\n"
        "str q15, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q14, [x20, #0x0]\n"
        "str q17, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q1, [x20, #0x0]\n"
        "str q8, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q30, [x20, #0x0]\n"
        "str q28, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q10, [x20, #0x0]\n"
        "str q27, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q18, [x20, #0x0]\n"
        "str q12, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q4, [x20, #0x0]\n"
        "str q26, [x20, #0x10]\n"
        "b 11f\n"
        "6:"  // Partial output
        "mov x27, %x[dst]\n"
        "add x26, x27, %x[dst_stride_row], LSL #2\n"
        "add x25, x26, %x[dst_stride_row], LSL #1\n"
        "add x24, x26, %x[dst_stride_row]\n"
        "add x23, x25, %x[dst_stride_row]\n"
        "add x22, x27, %x[dst_stride_row], LSL #1\n"
        "add x21, x27, %x[dst_stride_row]\n"
        "add x20, x22, %x[dst_stride_row]\n"
        "tbz x9, #2, 8f\n"
        "st1 { v4.4s }, [x23], #0x10\n"
        "st1 { v18.4s }, [x25], #0x10\n"
        "st1 { v10.4s }, [x24], #0x10\n"
        "st1 { v30.4s }, [x26], #0x10\n"
        "st1 { v1.4s }, [x20], #0x10\n"
        "st1 { v14.4s }, [x22], #0x10\n"
        "st1 { v11.4s }, [x21], #0x10\n"
        "st1 { v13.4s }, [x27], #0x10\n"
        "tbz x9, #1, 7f\n"
        "st1 { v26.d }[0], [x23], #0x8\n"
        "st1 { v12.d }[0], [x25], #0x8\n"
        "st1 { v27.d }[0], [x24], #0x8\n"
        "st1 { v28.d }[0], [x26], #0x8\n"
        "st1 { v8.d }[0], [x20], #0x8\n"
        "st1 { v17.d }[0], [x22], #0x8\n"
        "st1 { v15.d }[0], [x21], #0x8\n"
        "st1 { v22.d }[0], [x27], #0x8\n"
        "tbz x9, #0, 10f\n"
        "st1 { v26.s }[2], [x23]\n"
        "st1 { v12.s }[2], [x25]\n"
        "st1 { v27.s }[2], [x24]\n"
        "st1 { v28.s }[2], [x26]\n"
        "st1 { v8.s }[2], [x20]\n"
        "st1 { v17.s }[2], [x22]\n"
        "st1 { v15.s }[2], [x21]\n"
        "st1 { v22.s }[2], [x27]\n"
        "b 10f\n"
        "7:"  // Output block 0: partial_1_4
        "tbz x9, #0, 10f\n"
        "st1 { v26.s }[0], [x23]\n"
        "st1 { v12.s }[0], [x25]\n"
        "st1 { v27.s }[0], [x24]\n"
        "st1 { v28.s }[0], [x26]\n"
        "st1 { v8.s }[0], [x20]\n"
        "st1 { v17.s }[0], [x22]\n"
        "st1 { v15.s }[0], [x21]\n"
        "st1 { v22.s }[0], [x27]\n"
        "b 10f\n"
        "8:"  // Output block 0: partial_2_0
        "tbz x9, #1, 9f\n"
        "st1 { v4.d }[0], [x23], #0x8\n"
        "st1 { v18.d }[0], [x25], #0x8\n"
        "st1 { v10.d }[0], [x24], #0x8\n"
        "st1 { v30.d }[0], [x26], #0x8\n"
        "st1 { v1.d }[0], [x20], #0x8\n"
        "st1 { v14.d }[0], [x22], #0x8\n"
        "st1 { v11.d }[0], [x21], #0x8\n"
        "st1 { v13.d }[0], [x27], #0x8\n"
        "tbz x9, #0, 10f\n"
        "st1 { v4.s }[2], [x23]\n"
        "st1 { v18.s }[2], [x25]\n"
        "st1 { v10.s }[2], [x24]\n"
        "st1 { v30.s }[2], [x26]\n"
        "st1 { v1.s }[2], [x20]\n"
        "st1 { v14.s }[2], [x22]\n"
        "st1 { v11.s }[2], [x21]\n"
        "st1 { v13.s }[2], [x27]\n"
        "b 10f\n"
        "9:"  // Output block 0: partial_1_0
        "st1 { v4.s }[0], [x23]\n"
        "st1 { v18.s }[0], [x25]\n"
        "st1 { v10.s }[0], [x24]\n"
        "st1 { v30.s }[0], [x26]\n"
        "st1 { v1.s }[0], [x20]\n"
        "st1 { v14.s }[0], [x22]\n"
        "st1 { v11.s }[0], [x21]\n"
        "st1 { v13.s }[0], [x27]\n"
        "10:"  // Output block 0: Done
        "11:"  // Output stage exit
        "subs x9, x9, #0x8\n"
        "add %x[dst], %x[dst], #0x20\n"
        "bgt 2b\n"
        "mov x20, #0x2\n"
        "sub x12, x12, #0x8\n"
        "cmp x12, #0x8\n"
        "mov %x[dst], x28\n"
        // Advance the LHS pointer by two packed blocks (8 rows)
        "madd %x[lhs_packed], x20, x11, %x[lhs_packed]\n"
        "bge 1b\n"
        "12:"  // Row loop skip
        "cbz x12, 23f\n"
        "13:"  // Row tail: Row loop
        "mov x26, %x[rhs_packed]\n"
        "mov x25, %x[n]\n"
        "add x24, %x[dst], %x[dst_stride_row], LSL #2\n"
        "14:"  // Row tail: Column loop
        "mov x22, %x[lhs_packed]\n"
        "movi v13.4s, #0x0\n"
        "movi v22.4s, #0x0\n"
        "mov x20, %x[num_blocks]\n"
        "movi v11.4s, #0x0\n"
        "movi v15.4s, #0x0\n"
        "movi v14.4s, #0x0\n"
        "movi v17.4s, #0x0\n"
        "movi v1.4s, #0x0\n"
        "movi v8.4s, #0x0\n"
        "15:"  // Row tail: Sub block loop
        "ldr q16, [x26, #0x0]\n"
        "ldr q7, [x26, #0x10]\n"
        "subs x20, x20, #0x1\n"
        "ldr q6, [x26, #0x20]\n"
        "ldr q5, [x26, #0x30]\n"
        "ldr q4, [x22, #0x0]\n"
        "ldr q9, [x22, #0x10]\n"
        "ldr q10, [x26, #0x40]\n"
        "ldr q3, [x26, #0x50]\n"
        "shl v0.16b, v16.16b, #0x4\n"
        "shl v19.16b, v7.16b, #0x4\n"
        "ldr q31, [x26, #0x60]\n"
        "ldr q27, [x26, #0x70]\n"
        "shl v18.16b, v6.16b, #0x4\n"
        "shl v12.16b, v5.16b, #0x4\n"
        "ldr q29, [x22, #0x20]\n"
        "ldr q28, [x22, #0x30]\n"
        "and v16.16b, v16.16b, v24.16b\n"
        "and v7.16b, v7.16b, v24.16b\n"
        "ldr q2, [x22, #0x40]\n"
        "ldr q23, [x22, #0x50]\n"
        ".inst 0x4e80a48d  // smmla v13.4s, v4.16b, v0.16b\n"
        ".inst 0x4e93a48b  // smmla v11.4s, v4.16b, v19.16b\n"
        "ldr q30, [x22, #0x60]\n"
        "ldr q21, [x22, #0x70]\n"
        ".inst 0x4e92a496  // smmla v22.4s, v4.16b, v18.16b\n"
        ".inst 0x4e8ca48f  // smmla v15.4s, v4.16b, v12.16b\n"
        ".inst 0x4e80a52e  // smmla v14.4s, v9.16b, v0.16b\n"
        ".inst 0x4e93a521  // smmla v1.4s, v9.16b, v19.16b\n"
        "shl v20.16b, v10.16b, #0x4\n"
        "add x26, x26, #0x80\n"
        ".inst 0x4e92a531  // smmla v17.4s, v9.16b, v18.16b\n"
        ".inst 0x4e8ca528  // smmla v8.4s, v9.16b, v12.16b\n"
        "shl v19.16b, v3.16b, #0x4\n"
        "add x22, x22, #0x80\n"
        "shl v18.16b, v31.16b, #0x4\n"
        "shl v12.16b, v27.16b, #0x4\n"
        ".inst 0x4e94a7ad  // smmla v13.4s, v29.16b, v20.16b\n"
        "and v6.16b, v6.16b, v24.16b\n"
        "and v5.16b, v5.16b, v24.16b\n"
        ".inst 0x4e93a7ab  // smmla v11.4s, v29.16b, v19.16b\n"
        ".inst 0x4e94a78e  // smmla v14.4s, v28.16b, v20.16b\n"
        ".inst 0x4e93a781  // smmla v1.4s, v28.16b, v19.16b\n"
        "and v10.16b, v10.16b, v24.16b\n"
        ".inst 0x4e92a7b6  // smmla v22.4s, v29.16b, v18.16b\n"
        ".inst 0x4e8ca7af  // smmla v15.4s, v29.16b, v12.16b\n"
        "and v3.16b, v3.16b, v24.16b\n"
        ".inst 0x4e92a791  // smmla v17.4s, v28.16b, v18.16b\n"
        ".inst 0x4e8ca788  // smmla v8.4s, v28.16b, v12.16b\n"
        "and v31.16b, v31.16b, v24.16b\n"
        ".inst 0x4e90a44d  // smmla v13.4s, v2.16b, v16.16b\n"
        ".inst 0x4e87a44b  // smmla v11.4s, v2.16b, v7.16b\n"
        "and v27.16b, v27.16b, v24.16b\n"
        ".inst 0x4e90a6ee  // smmla v14.4s, v23.16b, v16.16b\n"
        ".inst 0x4e87a6e1  // smmla v1.4s, v23.16b, v7.16b\n"
        ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
        ".inst 0x4e85a44f  // smmla v15.4s, v2.16b, v5.16b\n"
        ".inst 0x4e86a6f1  // smmla v17.4s, v23.16b, v6.16b\n"
        ".inst 0x4e85a6e8  // smmla v8.4s, v23.16b, v5.16b\n"
        ".inst 0x4e8aa7cd  // smmla v13.4s, v30.16b, v10.16b\n"
        ".inst 0x4e83a7cb  // smmla v11.4s, v30.16b, v3.16b\n"
        ".inst 0x4e8aa6ae  // smmla v14.4s, v21.16b, v10.16b\n"
        ".inst 0x4e83a6a1  // smmla v1.4s, v21.16b, v3.16b\n"
        ".inst 0x4e9fa7d6  // smmla v22.4s, v30.16b, v31.16b\n"
        ".inst 0x4e9ba7cf  // smmla v15.4s, v30.16b, v27.16b\n"
        ".inst 0x4e9fa6b1  // smmla v17.4s, v21.16b, v31.16b\n"
        ".inst 0x4e9ba6a8  // smmla v8.4s, v21.16b, v27.16b\n"
        "bgt 15b\n"
        "ldr q21, [x26, #0x0]\n"
        "ldr q20, [x26, #0x10]\n"
        "uzp1 v9.2d, v13.2d, v11.2d\n"
        "uzp2 v2.2d, v13.2d, v11.2d\n"
        "ld1 { v19.4s }, [x22]\n"
        "ldr q27, [x26, #0x20]\n"
        "uzp1 v0.2d, v22.2d, v15.2d\n"
        "uzp2 v31.2d, v22.2d, v15.2d\n"
        "ldr q13, [x26, #0x30]\n"
        "uzp1 v29.2d, v14.2d, v1.2d\n"
        "uzp2 v10.2d, v14.2d, v1.2d\n"
        "add x22, x22, #0x10\n"
        "ldr q23, [x22, #0x0]\n"
        "uzp1 v5.2d, v17.2d, v8.2d\n"
        "uzp2 v18.2d, v17.2d, v8.2d\n"
        "add x26, x26, #0x40\n"
        "mla v9.4s, v21.4s, v19.s[0]\n"
        "mla v0.4s, v20.4s, v19.s[0]\n"
        "mla v2.4s, v21.4s, v19.s[1]\n"
        "mla v31.4s, v20.4s, v19.s[1]\n"
        "mla v29.4s, v21.4s, v19.s[2]\n"
        "mla v5.4s, v20.4s, v19.s[2]\n"
        "fmul v30.4s, v27.4s, v23.s[0]\n"
        "mla v10.4s, v21.4s, v19.s[3]\n"
        "mla v18.4s, v20.4s, v19.s[3]\n"
        "fmul v17.4s, v13.4s, v23.s[0]\n"
        "scvtf v9.4s, v9.4s\n"
        "scvtf v0.4s, v0.4s\n"
        "fmul v21.4s, v27.4s, v23.s[1]\n"
        "scvtf v2.4s, v2.4s\n"
        "fmul v20.4s, v13.4s, v23.s[1]\n"
        "scvtf v31.4s, v31.4s\n"
        "fmul v19.4s, v27.4s, v23.s[2]\n"
        "scvtf v29.4s, v29.4s\n"
        "fmul v28.4s, v13.4s, v23.s[2]\n"
        "scvtf v5.4s, v5.4s\n"
        "fmul v26.4s, v27.4s, v23.s[3]\n"
        "scvtf v10.4s, v10.4s\n"
        "fmul v16.4s, v13.4s, v23.s[3]\n"
        "scvtf v18.4s, v18.4s\n"
        "fmul v13.4s, v9.4s, v30.4s\n"
        "fmul v22.4s, v0.4s, v17.4s\n"
        "fmul v11.4s, v2.4s, v21.4s\n"
        "fmul v15.4s, v31.4s, v20.4s\n"
        "fmul v14.4s, v29.4s, v19.4s\n"
        "fmul v17.4s, v5.4s, v28.4s\n"
        "fmul v1.4s, v10.4s, v26.4s\n"
        "fmul v8.4s, v18.4s, v16.4s\n"
        "ldr q19, [x26, #0x0]\n"
        "ldr q18, [x26, #0x10]\n"
        "add x20, %x[clamp_vals], #0x4\n"
        "cmp x25, #0x8\n"
        "ld1r { v20.4s }, [%x[clamp_vals]]\n"
        "ld1r { v27.4s }, [x20]\n"
        "add x26, x26, #0x20\n"
        "fadd v13.4s, v13.4s, v19.4s\n"
        "fadd v22.4s, v22.4s, v18.4s\n"
        "fadd v11.4s, v11.4s, v19.4s\n"
        "fadd v15.4s, v15.4s, v18.4s\n"
        "fadd v14.4s, v14.4s, v19.4s\n"
        "fadd v17.4s, v17.4s, v18.4s\n"
        "fadd v1.4s, v1.4s, v19.4s\n"
        "fadd v8.4s, v8.4s, v18.4s\n"
        "fmax v13.4s, v13.4s, v20.4s\n"
        "fmax v22.4s, v22.4s, v20.4s\n"
        "fmax v11.4s, v11.4s, v20.4s\n"
        "fmax v15.4s, v15.4s, v20.4s\n"
        "fmax v14.4s, v14.4s, v20.4s\n"
        "fmax v17.4s, v17.4s, v20.4s\n"
        "fmax v1.4s, v1.4s, v20.4s\n"
        "fmax v8.4s, v8.4s, v20.4s\n"
        "fmin v13.4s, v13.4s, v27.4s\n"
        "fmin v22.4s, v22.4s, v27.4s\n"
        "fmin v11.4s, v11.4s, v27.4s\n"
        "fmin v15.4s, v15.4s, v27.4s\n"
        "fmin v14.4s, v14.4s, v27.4s\n"
        "fmin v17.4s, v17.4s, v27.4s\n"
        "fmin v1.4s, v1.4s, v27.4s\n"
        "fmin v8.4s, v8.4s, v27.4s\n"
        "blt 17f\n"
        // Full 8-column store; rows beyond the remaining row count (x12) are skipped
        "mov x20, %x[dst]\n"
        "cmp x12, #0x1\n"
        "str q13, [x20, #0x0]\n"
        "str q22, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "ble 22f\n"
        "cmp x12, #0x2\n"
        "str q11, [x20, #0x0]\n"
        "str q15, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "ble 22f\n"
        "cmp x12, #0x3\n"
        "str q14, [x20, #0x0]\n"
        "str q17, [x20, #0x10]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "ble 22f\n"
        "str q1, [x20, #0x0]\n"
        "str q8, [x20, #0x10]\n"
        "b 22f\n"
        "17:"  // Row tail: Partial output
        // Row pointers for missing rows are aliased (csel) to the previous valid row,
        // so the stores below never go past the last valid row.
        "mov x23, %x[dst]\n"
        "cmp x12, #0x1\n"
        "add x22, x23, %x[dst_stride_row]\n"
        "csel x22, x22, x23, GT\n"
        "cmp x12, #0x2\n"
        "add x21, x23, %x[dst_stride_row], LSL #1\n"
        "csel x21, x21, x22, GT\n"
        "cmp x12, #0x3\n"
        "add x20, x21, %x[dst_stride_row]\n"
        "csel x20, x20, x21, GT\n"
        "tbz x25, #2, 19f\n"
        "st1 { v1.4s }, [x20], #0x10\n"
        "st1 { v14.4s }, [x21], #0x10\n"
        "st1 { v11.4s }, [x22], #0x10\n"
        "st1 { v13.4s }, [x23], #0x10\n"
        "tbz x25, #1, 18f\n"
        "st1 { v8.d }[0], [x20], #0x8\n"
        "st1 { v17.d }[0], [x21], #0x8\n"
        "st1 { v15.d }[0], [x22], #0x8\n"
        "st1 { v22.d }[0], [x23], #0x8\n"
        "tbz x25, #0, 21f\n"
        "st1 { v8.s }[2], [x20]\n"
        "st1 { v17.s }[2], [x21]\n"
        "st1 { v15.s }[2], [x22]\n"
        "st1 { v22.s }[2], [x23]\n"
        "b 21f\n"
        "18:"  // Row tail: Output block 0: partial_1_4
        "tbz x25, #0, 21f\n"
        "st1 { v8.s }[0], [x20]\n"
        "st1 { v17.s }[0], [x21]\n"
        "st1 { v15.s }[0], [x22]\n"
        "st1 { v22.s }[0], [x23]\n"
        "b 21f\n"
        "19:"  // Row tail: Output block 0: partial_2_0
        "tbz x25, #1, 20f\n"
        "st1 { v1.d }[0], [x20], #0x8\n"
        "st1 { v14.d }[0], [x21], #0x8\n"
        "st1 { v11.d }[0], [x22], #0x8\n"
        "st1 { v13.d }[0], [x23], #0x8\n"
        "tbz x25, #0, 21f\n"
        "st1 { v1.s }[2], [x20]\n"
        "st1 { v14.s }[2], [x21]\n"
        "st1 { v11.s }[2], [x22]\n"
        "st1 { v13.s }[2], [x23]\n"
        "b 21f\n"
        "20:"  // Row tail: Output block 0: partial_1_0
        "st1 { v1.s }[0], [x20]\n"
        "st1 { v14.s }[0], [x21]\n"
        "st1 { v11.s }[0], [x22]\n"
        "st1 { v13.s }[0], [x23]\n"
        "21:"  // Row tail: Output block 0: Done
        "22:"  // Row tail: Output stage exit
        "subs x25, x25, #0x8\n"
        "add %x[dst], %x[dst], #0x20\n"
        "bgt 14b\n"
        "subs x12, x12, #0x4\n"
        "add %x[lhs_packed], %x[lhs_packed], x11\n"
        "mov %x[dst], x24\n"
        "bgt 13b\n"
        "23:"  // Row tail: Row loop skip
        : [dst] "+&r"(dst), [lhs_packed] "+&r"(lhs_packed)
        : [clamp_vals] "r"(clamp_vals), [dst_stride_row] "r"(dst_stride_row), [m] "r"(m), [n] "r"(n),
          [num_blocks] "r"(num_blocks), [rhs_packed] "r"(rhs_packed)
        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
          "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
          "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}
#endif  // Architectural feature check
